Contents

1 About

This document contains computer code to reproduce analyses and figures presented in the corresponding manuscript.

2 Dependencies

We load a number of packages which provide functionality that is required for downstream analyses.

library(broom)
library(reshape2)
library(mixtools)
library(lmtest)
library(sandwich)
library(pheatmap)
library(org.Hs.eg.db)
library(clusterProfiler)
library(fgsea)
library(umap)
library(GO.db)
library(Organism.dplyr)
library(patchwork)
library(ggrepel)
library(edgeR)
library(IHW)
library(ChIPpeakAnno)
library(BSgenome.Hsapiens.UCSC.hg19)
library(MASS)
library(Gviz)
library(GenomicRanges)
library(biomaRt)
library(org.Hs.eg.db)
library(cowplot)
library(tidyverse)

3 Context-dependent essential genes are enriched for lineage dependency transcription factors

3.1 Data

We next load a number of datasets that downstream analyses are based on. These include formatted CERES scores for the DepMap 19Q1 dataset and a list of transcription factors in the human genome (as determined by Lambert et al., 2018).

## The CERES scores for DepMap 19Q1
data('depmap_ceres_700', package='HDCRC2019')

## list of TF
data('tf_list', package='HDCRC2019')

3.2 Context-specific gene dependencies

Context-dependent gene dependency can be detected by an increased dropout compared to the ‘null distribution’ that represents the baseline phenotype of a gene knockout. To infer the null distribution, we center CERES scores for each gene. To this end we fit a Gaussian mixture model with 2 components to the distribution of CERES scores for each gene. Assuming that phenotypes for a gene knockout are similar for the majority of cell lines, we select the component that represents most of the data as the ‘null-component’ and save its parameters (mean and standard deviation).

## needs random seed for reproducibility
set.seed(1234)

pb <- progress_estimated(length(unique(depmap_ceres$symbol)))
mm_set <- depmap_ceres %>% 
  group_by(symbol) %>%
  group_modify(~ {
    pb$tick()$print()
    ## fit gaussian mixture
    mm <- tryCatch(normalmixEM(.x$cscore), 
                   error=function(cond) return(NULL))
    if(!is.null(mm)){
      ## select largest component
      comp_count <- as_tibble(mm$posterior) %>% 
        mutate(comp = ifelse(comp.1 > comp.2, 1, 2)) %>% 
        count(comp)
      compl <- comp_count %>% arrange(desc(n)) %>% 
        pull(comp) %>% head(1)
      comp_frac <- `/`(comp_count %>% filter(comp == compl) %>% pull(n),
                       sum(comp_count$n))
      ## extract paramters
      tibble(
        mu_null = mm$mu[compl],
        mu_alternative = mm$mu[-compl],
        sigma_null = mm$sigma[compl],
        sigma_alternative = mm$sigma[-compl],
        comp_size = comp_frac
      )
    } else {
      tibble(
        mu_null = NA,
        mu_alternative = NA,
        sigma_null = NA,
        sigma_alternative = NA,
        comp_size = NA
      )
    }
  }) %>% ungroup()

We next calculate P-values for each phenotype that represents the probability of having observed the phenotype under the inferred null distribution of knockout phenotypes for the same gene across all cell lines. We select gene dependencies as context-sepcific if FDR < 20%.

## calculate P-values, name context-dependent genes
depmap_ceres <- depmap_ceres %>% left_join(mm_set) %>% 
  mutate(pval_cd = 2*(pnorm(cscore, mean = mu_null, sd=sigma_null)), 
         FDR_cd = p.adjust(pval_cd, method='BH'),
         context_ess = ifelse(FDR_cd < 0.2, T, F))

We can visualize the results looking at examples of oncogenes that are expected to show contex-specific essentiality (such as KRAS, BRAF, etc.). We plot the distribution of phenotypes across all cell lines, highlight the inferred null-distribution of baseline phenotypes and highlight the 20% FDR cutoff.

## plot centered CERES scores indicating null distr
plot_example_nulldistr <- function(gene, df){
  ex_distr <- df %>% filter(symbol == gene)
  ex_mu <- ex_distr %>% pull(mu_null) %>% head(1)
  ex_sig <- ex_distr %>% pull(sigma_null) %>% head(1)
    
  ## aproximate FDR 20% cutoff
  co <- ex_distr %>% filter(FDR_cd < 0.2) %>% 
    arrange(desc(pval_cd)) %>% pull(cscore) %>% head(1)
  
  ex_distr %>% ggplot(aes(cscore)) + 
    geom_histogram(aes(y=..density..), bins=20) + 
    stat_function(fun = dnorm, colour = '#db4437', 
                  args=list(mean = ex_mu, sd = ex_sig)) + 
    scale_y_continuous(expand=c(0,0)) + 
    geom_vline(xintercept = co, linetype = 'dashed', colour='#4285f4') +
    xlab('Viability [CERES score]')
}

plot_example_nulldistr('KRAS', depmap_ceres)

plot_example_nulldistr('PIK3CA', depmap_ceres)

plot_example_nulldistr('BRAF', depmap_ceres)

plot_example_nulldistr('NRAS', depmap_ceres)

plot_example_nulldistr('MITF', depmap_ceres)

plot_example_nulldistr('CTNNB1', depmap_ceres)

4 Systematic identification of lineage-specific essential genes

4.1 Examples

In order to visualize examples we define a function that can plot these lineage depdency scores (LD-scores).

plot_mr_score <- function(gene, ctype, plot_tissues, df){
  df <- df %>% 
    filter(symbol == gene, lineage %in% plot_tissues) %>%
    mutate(cscore_sc = -1*((cscore-mu_null)/sigma_null)) %>%
    group_by(lineage) %>% mutate(mn = mean(cscore)) %>% ungroup() %>%
    arrange(mn) %>% mutate(lineage = factor(lineage, levels = unique(.$lineage))) %>% 
    mutate(highlight = ifelse(lineage %in% ctype, T, F)) %>%
    filter(!is.na(lineage))
  
  mean_line <- 0
  
  df %>%
    ggplot(aes(lineage, cscore_sc, colour=highlight)) + 
    geom_jitter(width = 0.1, color = '#dddddd') +
    stat_summary(fun.min = ~ 0, fun.max = 'mean', geom='linerange', size = 2) +
    stat_summary(fun = 'mean', fun.max = 'mean', 
                 fun.min = 'mean', geom='point', size=3) +
    geom_hline(yintercept = mean_line, colour='#4285f4') +
    theme(axis.text.x = element_text(angle=45, hjust=1),
          legend.position = 'none') + 
    scale_colour_manual(values=c('#111111', '#4285f4')) + 
    ylab('-CERES score [scaled]') + xlab('Cancer type') + 
    ggtitle(paste('Knockout gene:', gene))
}

We use these function to visualize examples for a number of different cancer lineages. We consider specifically those lineages that are represented with at least 10 different cell lines in the dependency dataset.

## select tissues represented at sufficient frequency
pt <- depmap_ceres %>% distinct(lineage, cellline) %>% 
  count(lineage) %>% arrange(desc(n)) %>% filter(n > 10) %>% 
  pull(lineage)

## draw a plot that indicates why 10 might be a reasonable threshold
depmap_ceres %>% distinct(lineage, cellline) %>% 
  count(lineage) %>% arrange(desc(n)) %>% mutate(rank = 1:n()) %>% 
  ggplot(aes(rank, n)) + geom_point() + 
  geom_hline(yintercept = 10, colour = '#db4437', linetype = 'dashed') + 
  ylab('Number of cell lines') + xlab('Lineage rank')

## plot LD-scores for tissues of interest
plot_mr_score('MITF', 'skin', pt, depmap_ceres)

plot_mr_score('CDX2', 'colorectal', pt, depmap_ceres)

plot_mr_score('EGFR', c('upper_aerodigestive', 'bile_duct', 
                        'colorectal', 'esophagus',
                        'urinary_tract', 'gastric', 'liver', 'uterus',
                        'pancreas', 'ovary', 'kidney', 'lung'), pt, depmap_ceres)

plot_mr_score('MYC', 'peripheral_nervous_system', pt, depmap_ceres)

4.2 Systematic identification of lineage survival genes using the LD-score model

Using a linear model based appraoch we now systematically calculate lineage dependency scores for each genes in each tumor type. Before that we annotate TP53 mutation status for each cell line so we can account for it during LD-score calculation. We do this because TP53 occur very frequently and previous studies have shown that TP53 status affects the gene essentiality profile of a cell line in a tissue-independent manner. The mutation status is inferred from mutation data in the Cancer Cell Line Encyclopedia and exclues silent mutations.

## load tp53 mutation status in cancer cell lines
data('tp53_status', package='HDCRC2019')

## is tp53 mutation status predictive of MDM2 phenotype?
tp53_status %>% 
  right_join(depmap_ceres %>% filter(symbol == 'MDM2') %>% 
               dplyr::select(cellline, cscore)) %>% 
  ggplot(aes(TP53_status, cscore)) + geom_jitter()

We calculate LD-scores for all genes that show a significant (CERES score < -0.5) viability phenotype in at least one cell line in the CRISPR-Cas9 dataset We exclude SOX9, which has been shown to be affected by off-target effects in the Avana library (Fortin et al., 2019) and TP53 since we account for its mutation status.

## subset of the data containing genes that show 
## essentiality in some context
ceres_ess <- depmap_ceres %>% 
  group_by(symbol) %>% filter(min(cscore) < -0.5) %>% ungroup() %>%
  filter(lineage %in% pt, !is.na(mu_null),
         !symbol %in% c('SOX9', 'TP53')) %>%
  inner_join(tp53_status) %>%
  ## center and scale
  group_by(symbol) %>% 
  mutate(cscore_sc = -1*((cscore - mu_null)/sigma_null)) %>% 
  ungroup()

## progress bar
pb <- progress_estimated(length(unique(ceres_ess$symbol)))

## run linear modelling without controlling for anything
ld_crispr <- ceres_ess %>% 
  # filter(symbol %in% c('KRAS', 'BRAF', 'CTNNB1')) %>%
  group_by(symbol) %>% 
  mutate(cscore_sc = -1*((cscore - mu_null)/sigma_null)) %>%
  group_modify(~{
    pb$tick()$print()
    fit <- lm(cscore_sc ~ 0 + lineage + TP53_status, data = .x)
    fit2 <- coeftest(fit, vcov = vcovHC(fit))
    ci95 <- confint(fit2, level = 0.95)
    tidy(fit2) %>% 
      inner_join(as_tibble(ci95, rownames = 'term'), by='term')
  }) %>%
  ungroup() %>%
  inner_join(ceres_ess %>% group_by(symbol) %>% 
               summarise(main_effect = median(cscore)) %>% 
               ungroup()) %>%
  filter(!term %in% c('TP53_statusmut', 'TP53_statuswt')) %>%
  mutate(FDR = p.adjust(p.value, method='BH')) %>%
  arrange(p.value) %>%
  mutate(lineage = gsub('^lineage', '', term)) %>% 
  dplyr::select(-term) %>%
  dplyr::select(symbol, lineage, everything())

We plot the distribution of the resulting LD-scores.

ld_crispr %>% ggplot(aes(estimate)) + 
  geom_histogram(bins=50) + 
  xlab('LD-score')

4.3 Number of specific essential genes per lineage

Is there a correlation between the number of available cell lines and tissue-specific essential genes?

## histogram
ld_crispr %>% filter(FDR < 0.05, estimate > 1) %>%
  count(lineage) %>%
  ggplot(aes(n)) + geom_histogram()

## stratified by lineage
ld_crispr %>% filter(FDR < 0.05, estimate > 1) %>% 
  count(lineage) %>% 
  inner_join(depmap_ceres %>% distinct(cellline, lineage) %>% 
               count(lineage) %>% dplyr::select(lineage, n_cl = n)) %>% 
  ggplot(aes(n, n_cl)) + geom_point() + 
  xlab('Number of specific essential genes') + 
  ylab('Number of cell lines in dataset')

There is no correlation. In fact, the tissue that is represented by the largest number of cell lines, Lung cancer, has very few tissue-specific essential genes.

4.4 Number of private and shared specific essential genes

How many specific essential genes are truly private to each tissue? Which specific essential genes are shared between lineages? Which lineages share the most essential genes?

## how many tissues is each gene specific to?
tissues_per_gene <- ld_crispr %>% 
  filter(estimate > 1, FDR < 0.05) %>% 
  group_by(symbol) %>%
  summarise(n_tis = n(), 
            max_ld = max(estimate),
            avg_ld = mean(estimate)) %>% 
  ungroup() %>%
  arrange(desc(n_tis)) %>%
  ## add mixture component size
  left_join(mm_set)

## add this information to ld-scores object
ld_crispr <- ld_crispr %>% 
  left_join(tissues_per_gene %>% dplyr::select(n_tis, symbol))

4.5 Examples of known lineage-specific essential genes

We generate a bar plot showing 3 different tissues and 2 associated known lineage-specific essential genes per tissue. Specifically, we select melanoma, colorectal cancer and leukemia.

## ld genes
genes <- c('CTNNB1', 'TCF7L2', 'MITF', 'BRAF', 'MYB', 'BCL2')

## make bar plot for selected tissues
ld_crispr %>% 
  filter(symbol %in% genes, 
         lineage %in% c('blood', 'skin', 'colorectal')) %>% 
  mutate(ymin = estimate - std.error, ymax = estimate + std.error,
         symbol = factor(symbol, levels = genes)) %>% 
  ggplot(aes(symbol, estimate, fill = lineage)) + 
  geom_bar(stat='identity', position = 'dodge') + 
  geom_errorbar(aes(ymin = ymin, ymax = ymax), position = 'dodge') + 
  scale_fill_manual(values = c('#dddddd', '#777777', '#111111'))

In many cases there is considerable variation in terms of knockout phenotypes for cell lines representing the same lineage. We show an example where we can see that this heterogeneity is sometimes explained by insufficient stratification.

## example trps1 in breast cancer
depmap_ceres %>% 
  filter(lineage == 'breast', symbol == 'TRPS1') %>% 
  ggplot(aes(lineage_molecular_subtype, cscore)) + 
  geom_jitter(width = 0.2) +
  stat_summary(fun.y = 'mean', fun.ymin = 'mean', fun.ymax = 'mean',
               geom = 'crossbar', color = '#db4437', width = 0.5) + 
  theme(axis.text.x  = element_text(angle=45, hjust=1)) + 
  ylab('CERES score')

5 Visualization lineage-specific essential processes

Which processes and pathways are specific to each lineage, where are the overlaps? We generate a large heatmap including the most lineage-specific essential genes and their effects in all cell lines that depend on any of these genes. We examine clusters in this heatmap to gain a better understanding of lineage-specific essential processes as well as differences and similarities between lineages. In the heatmap we show centered and scaled CERES scores (similar to how they were used for LD-score calculation).

## add lineage to ccle name
ceres_ess <- ceres_ess %>% 
  mutate(CCLE_Name = paste(cellline, lineage, sep='_'))

## cell lines that depend on the strongest LS-gene for their tissue
strongest_ls_gene <- ld_crispr %>% 
  group_by(lineage) %>% top_n(1, estimate) %>% ungroup() %>% 
  distinct(lineage, symbol, estimate)

ls_dep_lines <- ceres_ess %>% 
  inner_join(strongest_ls_gene) %>%
  filter(cscore_sc > 3) %>% pull(cellline)

## significant ls genes
ls_genes <- ld_crispr %>% filter(estimate > 2, FDR < 0.05) %>%
  pull(symbol) %>% unique()

## plot a heatmap
## annotate clusters
groups <- ceres_ess %>%
  filter(cellline %in% ls_dep_lines,
         symbol %in% ls_genes) %>%
  acast(CCLE_Name ~ symbol, value.var = 'cscore_sc') %>% 
  dist() %>% hclust(method = 'ward.D2') %>% 
  cutree(k = 15)

groups <- tibble(CCLE_Name = names(groups),
       cluster = as.factor(groups))

lineage_annotation <- ceres_ess %>% 
  distinct(CCLE_Name, lineage) %>% 
  inner_join(groups) %>% 
  # dplyr::select(-cluster) %>%
  mutate(lineage = as.factor(lineage)) %>%
  as.data.frame() %>% column_to_rownames('CCLE_Name')

colors <- c(rep('#67001f', 7), '#b2182b','#d6604d',
            '#f4a582','#f7f7f7','#92c5de','#4393c3',
            '#2166ac',rep('#053061', 7))

## colors for lineages 
lin_colpal <- c("#023fa5", "#7d87b9", "#bec1d4", "#d6bcc0", "#bb7784", "#8e063b", "#4a6fe3", "#8595e1", "#b5bbe3", "#e6afb9", "#e07b91", "#d33f6a", "#11c638", "#8dd593", "#c6dec7", "#ead3c6", "#f0b98d", "#ef9708", "#0fcfc0", "#9cded6", "#d5eae7", "#f3e1eb", "#f6c4e1", "#f79cd4")
ann_colors <- list(lineage = setNames(lin_colpal, levels(lineage_annotation$lineage)))

ceres_ess %>%
  filter(cellline %in% ls_dep_lines,
         symbol %in% ls_genes) %>%
  mutate(sample = paste(cellline, lineage), sep='_') %>%
  acast(symbol ~ CCLE_Name, value.var = 'cscore_sc') %>% 
  pheatmap(annotation_col = lineage_annotation,
           annotation_colors = ann_colors,
           color = colorRampPalette(colors,
                                    bias = 1.6)(100),
           clustering_method = 'ward.D2')

5.1 Expression markers for unexpected clusters

There are some clusters that are unexpected. For example, there is a cluster that contains cell lines of both epithelial and mesenchymal origin and these cell lines to not display many lineage-specific vulnerabilities. We examine whether these cell lines express genes that would explain why they cluster together despite their different origin.

data('CCLE_18q4_TPM_long_all', package = 'HDCRC2019')

What are the cell lines in that cluster?

## cell lines in cluster
c7 <- rownames(subset(lineage_annotation, cluster == '7'))
cls_c7 <- c7 %>% strsplit('_') %>% map_chr(~.x[1])
cls_hm <- rownames(lineage_annotation) %>% strsplit('_') %>% map_chr(~ .x[1])
tissues_c7 <- c7 %>% strsplit('_')  %>% map_chr(~.x[2]) %>% unique()

## expr df for testing
expr_df <- ccle_expr_all %>% filter(cellline %in% cls_hm) %>%
  dplyr::select(-tissue) %>% 
  inner_join(distinct(depmap_ceres, cellline, lineage)) %>% 
  mutate(group = ifelse(cellline %in% cls_c7, 'c7', 'c_other'))

## t.test
diff_exp_c7 <- expr_df %>% group_by(symbol) %>% 
  group_modify(~ tidy(t.test(tpm ~ group, data = .x, var.equal = T))) %>% 
  ungroup() %>% arrange(p.value) %>% 
  mutate(fdr = p.adjust(p.value, method = 'BH'))

We plot indivdiual hits as boxplots.

expr_df %>% filter(symbol %in% c('GBX2', 'CHD5')) %>% 
  ggplot(aes(group, tpm)) + 
  geom_boxplot() + 
  facet_wrap(~ symbol)

We next perform a gene set over-rpresentation test on GO BP to find relevant processes.

diff_exp <- diff_exp_c7 %>% filter(fdr < 0.05, statistic < 0) %>% pull(symbol)
universe <- diff_exp_c7 %>% pull(symbol)
enr <- enrichGO(
  gene = diff_exp,
  universe = universe,
  OrgDb = org.Hs.eg.db,
  ont = 'BP',
  pAdjustMethod = 'BH',
  qvalueCutoff = 0.05,
  keyType = 'SYMBOL'
)

emapplot(enr)

We generate a barcode plot for the MSigDb core EMT gene set.

## load msigdb emt signature
data('msigdb_emt_sig', package = 'HDCRC2019')

ranks <- diff_exp_c7 %>% filter(!is.na(statistic)) %>% arrange(desc(statistic))
ranks <- setNames(ranks$statistic, ranks$symbol)

fgsea_res <- fgsea(
  pathways = list(emt = emt_sig),
  stats = ranks,
  nperm = 1e5
)

plotEnrichment(emt_sig, ranks)

We further create a volcano plot where standard MET markers are highlighted.

emt_markers <- c('ZEB1', 'CDH2', 'VIM', 'CDH1', 'AXL')
results_emt_markers <- diff_exp_c7 %>% filter(symbol %in% emt_markers)

diff_exp_c7 %>% 
  ggplot(aes(estimate2 - estimate1, -log10(p.value))) + 
  geom_hex(bins = 100, fill = '#dddddd') + 
  geom_point(data = results_emt_markers, 
             aes(estimate2 - estimate1, -log10(p.value)),
             color = '#0f9d58') +
  geom_text_repel(data = results_emt_markers, 
                  aes(estimate2 - estimate1, -log10(p.value),
                      label = symbol)) + 
  geom_vline(xintercept = 0, linetype = 'dashed')

6 Sources of intra-lineage heterogeneity

6.1 UMAP analysis

Based on what we observed in the lineage dependency heatmap we assume that clustering cell lines by their dependency on significant lineage survival genes might reveal intra-lineage sub-structures where there are differences in lineage-specific gene essentiality. We use the UMAP algorithm to create visualizations of the cell lines in each lineage based on their dependency of lineage survival genes with the intention of identifying patterns of heterogenity in the UMAP plots.

## select significant lineage survival genes
ld_genes <- ld_crispr %>% filter(FDR < 0.05, estimate > 1) %>% 
  distinct(symbol, lineage)

## generate matrices of scaled and centered ceres scores
lin_mat <- ceres_ess %>% inner_join(ld_genes) %>% 
  split(.$lineage) %>% 
  map(~ .x %>% acast(symbol ~ cellline, value.var = 'cscore_sc') %>% 
        .[,apply(., 2, function(x)!any(is.na(x)))])

## set umap parameters
umap_params <- umap.defaults
umap_params$min_dist <- 0.01

## run umap algorihtm with set parameters for each cell line
lin_umaps <- lin_mat %>% 
  map( ~ .x %>% t() %>% umap(config = umap_params) %>% .$layout %>% 
         as_tibble(rownames = 'cellline'))

## also run pca for comparison
lin_pca <- lin_mat %>% 
  map( ~ .x %>% t() %>% prcomp() %>% .$x %>% 
         as_tibble(rownames = 'cellline') %>%
         dplyr::select(cellline, PC1, PC2))

## generate annotation of expression markers that migh explain differences
vim_expr <- ccle_expr_all %>% 
  filter(symbol %in% c('VIM', 'CDH1', 'AXL', 'LGR5', 'VIL1', 
                       'EPCAM', 'ZEB1', 'MLANA', 'CD46')) %>%
  dplyr::select(symbol, tpm, cellline) %>%
  spread(symbol, tpm)

## merge umap results and marker gene expr.
lin_umaps <- lin_umaps %>% melt() %>% as_tibble() %>% 
  spread(variable, value) %>% 
  inner_join(depmap_ceres %>% 
               dplyr::select(cellline, metastasis = primary_or_metastasis) %>%
               distinct() %>%
               mutate(metastasis = ifelse(is.na(metastasis), 'no', 
                                   ifelse(metastasis == 'Primary', 'unknown',
                                          'yes')))) %>%
  left_join(vim_expr)

## same for the pca analysis
lin_pca <- lin_pca %>% melt() %>% as_tibble() %>% 
  spread(variable, value) %>% 
  inner_join(depmap_ceres %>% 
               dplyr::select(cellline, metastasis = primary_or_metastasis) %>%
               distinct() %>%
               mutate(metastasis = ifelse(is.na(metastasis), 'no', 
                                   ifelse(metastasis == 'Primary', 'unknown',
                                          'yes')))) %>%
  left_join(vim_expr)

##create scatter plot visualization of umap components
lin_umaps %>%
  ggplot(aes(V1, V2, color = metastasis)) +
  geom_point() + 
  facet_wrap(~ L1) + 
  theme_void() + 
  panel_border() +
  scale_color_manual(values = c('#111111', '#cccccc', '#db4437')) + 
  theme(legend.position = 'none')

## similar plot for pca
lin_pca %>%
  ggplot(aes(PC1, PC2)) +
  geom_point() + 
  facet_wrap(~ L1, scales = 'free') + 
  theme_void() + 
  panel_border() +
  scale_color_manual(values = c('#111111', '#cccccc', '#db4437'))

We observe distinct clustering indicating different subtypes in some lineages. In other lineages we see more longitudinal patterns that might indicate a continuous shift in lineage dependencies. We visualize a few examples.

## visualize examples
subtypes <- c('bone', 'breast', 'lymphocyte', 'soft_tissue', 'blood')
emt <- c('gastric', 'colorectal', 'esophagus', 
         'liver', 'lung', 'esophagus',
         'pancreas', 'urinary_tract')


lin_umaps %>% filter(L1 %in% subtypes) %>%
  ggplot(aes(V1, V2)) +
  geom_point() + 
  facet_wrap(~ L1, nrow = 1) + 
  theme_void() + 
  panel_border()

## skin cancer example highlighting mlana expression
lin_umaps %>% filter(L1 == 'skin') %>% 
  left_join(depmap_ceres %>% distinct(cellline, lineage_subtype)) %>%
  ggplot(aes(V1, V2, color = MLANA)) + 
  geom_point() +
  scale_color_gradient2(low = '#111111', mid = '#dddddd', high = '#db4437',
                        midpoint = 2.5)

## colorectal cancer example highlighting mlana expression
lin_umaps %>% filter(L1 == 'colorectal') %>% 
  left_join(depmap_ceres %>% distinct(cellline, lineage_subtype)) %>%
  ggplot(aes(V1, V2, color = LGR5)) + 
  geom_point() +
  scale_color_gradient2(low = '#f7f7f7', mid = '#cccccc', high = '#111111',
                        midpoint = 2.5)

6.2 Example heatmaps

There are several nice examples for differences.

  • Differences in subtype: luminal/basal breast cancer, b-cell/t-cell lymphoma
  • Dedifferentiation events: melanoma, colorectal cancer

We generate heatmaps to visualize in more detail intra-lineage heterogeneity for selected examples, highlighting the expression of certain marker genes where appropriate.

## color scale
colors <- c(rep('#b2182b', 5),'#d6604d','#f4a582',
            '#fddbc7', rep('#f7f7f7', 2),'#d1e5f0',
            '#92c5de','#4393c3',rep('#2166ac', 5))

## breast cancer
lin_mat$breast %>% 
  pheatmap(color = colorRampPalette(colors, bias = 2.5)(150),
           clustering_method = 'ward.D2',
           border_color = '#dddddd')

## lymphoma
lin_mat$lymphocyte %>% 
  pheatmap(color = colorRampPalette(colors, bias = 2.6)(150),
           clustering_method = 'ward.D2',
           border_color = '#dddddd')

## annotation of differentiation marker
colorectal_anno <- lin_umaps %>% filter(L1 == 'colorectal') %>%
  dplyr::select(cellline, LGR5, AXL) %>% 
  filter(!is.na(AXL)) %>%
  as.data.frame() %>% column_to_rownames('cellline')

lin_mat$colorectal %>% 
  .[,colnames(.) %in% rownames(colorectal_anno)] %>%
  pheatmap(color = colorRampPalette(colors, bias = 2.0)(150),
           annotation_col = colorectal_anno,
           clustering_method = 'ward.D2',
           border_color = NA,
           annotation_colors = list(
             AXL = colorRampPalette(c('#ffffff', '#34A853'))(50),
             LGR5 = colorRampPalette(c('#ffffff', '#444444'))(50)
           ))

## annotation of differentiation marker
melanoma_anno <- lin_umaps %>% filter(L1 == 'skin') %>%
  dplyr::select(cellline, MLANA) %>% 
  filter(!is.na(MLANA)) %>%
  as.data.frame() %>% column_to_rownames('cellline')

lin_mat$skin %>% 
  .[,colnames(.) %in% rownames(melanoma_anno)] %>%
  pheatmap(color = colorRampPalette(colors, bias = 1.7)(150),
           annotation_col = melanoma_anno,
           clustering_method = 'ward.D2',
           border_color = NA,
           annotation_colors = list(
             MLANA = colorRampPalette(c('#ffffff', '#db4437'))(50)
           ))

7 Lineage dependency transcription factors are important in both, normal and cancer tissue and can have oncogenic potential

We next ask whether the lineage dependency TF that we find in the CRISPR data can be lineage survival oncogenes (Garraway and Sellers, 2006). We start by selecting significant LD-TFs with LD-score > 1 and FDR < 5%.

## map of TF-tissue associations
ldtf_crispr <- ld_crispr %>% filter(symbol %in% tf_list)
ldtf_map <- ldtf_crispr %>%
  mutate(ldtf = ifelse((FDR < 0.05) & (estimate > 1), T, F)) %>%
  distinct(symbol, tissue, ldtf) %>%
  arrange(symbol)

7.1 Do LD-TF play a role in normal lineage survival?

7.1.1 ExAC database

The ExAC database contains data about exome sequencing studies in the population. Since we are interested in non-cancer individuals here, we are going to look at the dataset that excludes samples from the TCGA. ExAC contains loss-of-function Z-scores that indicate whether or not a gene is more or less likely to retain damaging mutations in individuals. A lower likelihood indicates that there is negative selection pressure on these mutations which would further indicate that the genes play an important role.

## load Exac scores for relevant genes (in Avana library)
data('exac_scores', package='HDCRC2019')

## select significant LD-TF (1% FDR, -0.15 LD-score)
ldtf_sig <- ldtf_crispr %>%
  filter(FDR < 0.05, estimate > 1) %>% pull(symbol)

## is gene ld-tf, other tf or non-tf?
exac_ldtf <- exac %>% dplyr::select(gene, lof_z, pLI) %>%
  mutate(gene_type = ifelse(gene %in% ldtf_sig, 'LD-TF',
                     ifelse((gene %in% tf_list) & !(gene %in% ldtf_sig), 'other-TF',
                            'non-TF'))) %>%
  gather(score_type, value, lof_z, pLI)

## plot LOF z-scores
exac_ldtf %>% filter(score_type == 'lof_z') %>%
  ggplot(aes(value, colour=gene_type, linetype = gene_type)) +
  geom_density() +
  scale_colour_manual(values = c('#4285f4', '#111111', '#111111')) +
  scale_linetype_manual(values = c('solid', 'dashed', 'solid')) +
  scale_y_continuous(expand = c(0, 0)) +
  xlim(c(-5, 13)) + xlab('ExAC LOF Z-score') +
  theme(legend.position = 'bottom')

## test for statistical significance (LD-TF vs other TF)
exac_ldtf %>% filter(gene_type %in% c('other-TF', 'LD-TF'),
                     score_type == 'lof_z') %>%
  t.test(value ~ gene_type, data = ., var.equal=T)

We make a similar plot for the pLI probabilities.

exac_ldtf %>% filter(score_type == 'pLI') %>%
  ggplot(aes(value, colour=gene_type, linetype = gene_type)) +
  geom_density() +
  scale_colour_manual(values = c('#4285f4', '#111111', '#111111')) +
  scale_linetype_manual(values = c('solid', 'dashed', 'solid')) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab('ExAC pLI score') +
  theme(legend.position = 'bottom')

8 Interactions between Wnt signaling and developmental TFs in colorectal cancer

## load ccle mutation data
data('CCLE_mutations', package = 'HDCRC2019')
## load 19q4 expression data
data('ccle_expr_colorectal_19q4', package = 'HDCRC2019')
## load list of valid genes (unique sgRNAs)
data('valid_genes', package = 'HDCRC2019')

## select colorectal cancer ld-tfs 
ld_crc <- ld_crispr %>%
  filter(symbol %in% valid_genes, lineage == 'colorectal', 
         symbol %in% c(tf_list, 'CTNNB1'))

## function for plotting
plot_ldtf_volcano <- function(df, ttype = 'skin', highlight = c()){
  ## select genes to highlight
  df_highlight <- df %>% filter(symbol %in% highlight,
                               lineage == ttype)
  
  ## estimate 1% FDR
  fdr1_co <- df %>% filter(FDR < 0.05) %>% 
    arrange(desc(FDR)) %>% pull(p.value) %>% 
    head(1) %>% log10() %>% `*`(-1)
  
  ## make volcano
  df %>% filter(lineage == ttype, !symbol %in% highlight) %>% 
    ggplot(aes(estimate, -log10(p.value))) + 
    geom_point(colour = '#dddddd') + 
    geom_point(data = df_highlight, aes(estimate, -log10(p.value)),
               colour = '#4285f4') + 
    geom_text_repel(data = df_highlight, 
                    aes(estimate, -log10(p.value), label = symbol)) + 
    geom_vline(xintercept = 0, linetype = 'dashed') +
    geom_vline(xintercept = 1, linetype = 'dashed', color = '#db4437') + 
    geom_hline(yintercept = fdr1_co, linetype = 'dashed') +
    xlab('Lineage dependency score') + ylab('P-value [-log10]')
}

## list of CRC hits
crc_tf <- ld_crc %>% filter(estimate > 1) %>% 
  top_n(10, -p.value) %>% pull(symbol)

## plot the volcano
plot_ldtf_volcano(ld_crc,
                  ttype = 'colorectal',
                  highlight = crc_tf)

8.1 Expression of colorectal lineage TFs in cell lines

I visualize the expression of colorectal lineage TFs across the different cell lines. This highlights heterogeneity - specifically between 2 groups of cell lines.

## colorectal cell lines
crc_lines <- ccle_expr_crc %>% 
  distinct(cellline) %>% pull(cellline)

## ctnnb1 ko
ctnnb1_ko <- depmap_ceres %>%
  filter(cellline %in% crc_lines, 
         symbol == 'CTNNB1') %>%
  dplyr::select(cellline, symbol, cscore) %>%
  spread(symbol, cscore) %>%
  dplyr::rename(CTNNB1_ko = CTNNB1)

## also add mutation status
cl_mut <- ccle_mut %>% 
  filter(symbol %in% c('RNF43', 'CTNNB1', 'APC'), 
         cellline %in% crc_lines) %>% 
  filter(isTCGAhotspot | isDeleterious, class != 'Splice_Site') %>%
  distinct(cellline, symbol) %>%
  mutate(has = 1) %>% 
  spread(symbol, has) %>%
  mutate_all(~ ifelse(is.na(.x), 0, .x))
  ## add Wnt dependency
  # left_join(ctnnb1_ko)

## heatmap annotation
mut_anno <- cl_mut %>% as.data.frame() %>%
  column_to_rownames('cellline')

## colors 
cols <- rev(colorRampPalette(c('#67001f','#b2182b','#d6604d',
                           '#f4a582','#fddbc7','#f7f7f7',
                           '#d1e5f0','#92c5de','#4393c3',
                           '#2166ac','#053061'))(50))

## visualize expression as a heatmap
hm <- ccle_expr_crc %>% 
  filter(symbol %in% crc_tf) %>% 
  acast(symbol ~ cellline, value.var = 'tpm') %>% 
  pheatmap(border_color = NA,
           annotation_col = mut_anno,
           clustering_method = 'ward.D2')

## heatmap with marker genes
cl_order <- hm$tree_col$labels[hm$tree_col$order]
hm2 <- ccle_expr_crc %>% 
  filter(symbol %in% c('LGR5', 'AXL', 'AXIN2')) %>%
  acast(symbol ~ cellline, value.var = 'tpm')
hm2 <- hm2[,cl_order]
pheatmap(hm2, border_color = NA, cluster_cols = F)

Do any of the clusters reflect responsiveness to Wnt knockouts?

## make a plot comparing three clusters and their Wnt-dependency
clusters <- ccle_expr_crc %>% 
  filter(symbol %in% crc_tf) %>% 
  acast(cellline ~ symbol, value.var = 'tpm') %>% 
  dist() %>% hclust(method = 'ward.D2') %>% 
  cutree(k = 2) %>% 
  melt() %>% as_tibble(rownames = 'cellline')

## ctnnb1 knockout phenotypes
clusters %>% inner_join(ctnnb1_ko) %>%
  mutate(value = ifelse(value %in% c(1,3), 1, value)) %>%
  ggplot(aes(as.factor(value), CTNNB1_ko, label = cellline)) + 
  geom_jitter(width = 0.2) + 
  geom_hline(yintercept = -0.5, linetype = 'dashed', color = '#db4437') +
  stat_summary(fun = 'mean', fun.min = 'mean', 
               fun.max = 'mean', width = 0.5,
               geom= 'crossbar', color = 'red') + 
  ggsignif::geom_signif(comparisons = list(c('1', '2')),
                        test = 't.test', 
                        test.args = list(var.equal = T))

Do we see the same clustering when we cluster cell lines in an unbiased manner based on the expression of the most variably expressed genes?

hm_unbiased <- ccle_expr_crc %>% 
  filter(symbol %in% valid_genes) %>% 
  acast(symbol ~ cellline, value.var = 'tpm') %>% 
  .[order(apply(.,1,sd), decreasing=T)[1:100],] %>%
  pheatmap(border_color = NA)

## add hnf4a/cdx2
cl_order <- hm_unbiased$tree_col$labels[hm_unbiased$tree_col$order]
hm2 <- ccle_expr_crc %>% 
  filter(symbol %in% c('HNF4A', 'CDX2')) %>%
  acast(symbol ~ cellline, value.var = 'tpm')
hm2 <- hm2[,cl_order]
pheatmap(hm2, border_color = NA, cluster_cols = F)

8.1.1 Expression in patient tumors

We check if we can also see similar gene expression clusters in colorectal cancer patient tumors from the TCGA. We might not necessarily expect similar cluster sizes, since it is known that outlier cancer types are often overrepresented in the cell line universe when they exhibit favorable growth properties. It would however, be important to see that similar clusters do exist.

## parse expression data in colorectal tumors (from TCGA)
data('colorectal_expr', package = 'HDCRC2019')

I first test, whether the expression of CDX2 and other lineage TF are correlated.

cdx2_expr <- colorectal_expr %>% filter(symbol == 'CDX2') %>% 
  dplyr::select(sample, cdx2 = rpkm)

## coexpression of CDX2 and other TF
colorectal_expr %>% 
  filter(symbol %in% c('HNF4A', 'KLF5', 'GATA6')) %>% 
  inner_join(cdx2_expr) %>% 
  ggplot(aes(rpkm, cdx2)) + 
  geom_point(aes(colour = sample_type)) + 
  geom_smooth(method='lm') + 
  facet_wrap(~ symbol, scales = 'free_x')

While I don’t see an effect for GATA6, the results look promising for KLF5 and especially HNF4A. I check globally, which genes are most correlated with CDX2 expression in tumors.

cdx2_cor <- colorectal_expr %>% filter(sample_type == 'tumor') %>% 
  inner_join(cdx2_expr) %>% 
  group_by(symbol) %>% summarise(pcc = cor(rpkm, cdx2)) %>% ungroup() %>% 
  drop_na()

I see a lot of differentiation markers, AXIN2, and in addition several essential colorectal lineage TFs.

8.2 CTNNB1-targets by RNA-seq

We load a count table from RNA-sequencing experiments that were performed in the Boutros lab where gene expression response to CTNNB1 konckdown by siRNA was measured. We also generate an additional table of normalized counts that we can use for clustering/visualization and comparison with DepMap TPM datasets.

## this loads the 'b110_counts' object (long format)
data('rnaseq_counts', package = 'HDCRC2019')

## Gene expression matrix of raw counts
b110_mat_raw <- b110_counts %>% 
  acast(GeneRef ~ cellline + knockdown, value.var = 'count') %>%
  .[complete.cases(.),]

## edgeR normalize cpm
dge <- DGEList(b110_mat_raw) %>% calcNormFactors()
cpm <- dge %>% edgeR::cpm()

We also generate a sample info data frame that we can use to annotate samples in various visualizations.

sample_info <- b110_counts %>% distinct(cellline, knockdown) %>%
  mutate(sample = paste(cellline, knockdown, sep='_')) %>% 
  separate(knockdown, c('knockdown', 'rep'), sep='_')

8.2.1 Quality control

We perform a principal component analysis to observe whether samples cluster as expected.

## perform pca 
pca <- prcomp(t(log(cpm+1)))

## data frame of PC values
pca_x <- pca$x %>% as_tibble(rownames='sample') %>% 
  dplyr::select(sample, PC1:PC5) %>% inner_join(sample_info)

## plot
pca_x %>% 
  ggplot(aes(PC1, PC2, colour = cellline, shape=knockdown)) +
  geom_point()

This looks good. The cell line clearly has a much larger impact on the transcriptome than the knockdown which is expected. Similar to what we saw in the expression heatmap, colo320 seems to be an outlier cell line.

We next plot a number of gold-standard Wnt target genes to see whether knockdowns worked as intended.

## cpm as long df
cpm_long <- cpm %>% as_tibble(rownames='symbol') %>% 
  gather(sample, cpm, -symbol) %>%
  inner_join(sample_info)

cpm %>% as_tibble(rownames='symbol') %>% 
  filter(symbol %in% c('AXIN2', 'CTNNB1')) %>% 
  gather(sample, cpm, -symbol) %>% 
  inner_join(sample_info) %>% 
  filter(knockdown %in% c('CTRL', 'CTNNB1')) %>%
  mutate(knockdown = factor(knockdown, levels = c('CTRL', 'CTNNB1')),
         symbol = factor(symbol, levels = c('CTNNB1', 'AXIN2', 'SACS'))) %>%
  ggplot(aes(knockdown, cpm)) + 
  stat_summary(fun.y = 'mean', geom='bar') + 
  geom_point(aes(color = rep)) + 
  facet_grid(symbol ~ cellline, scales = 'free_y') + 
  theme(axis.text.x = element_text(angle=45, hjust=1))

8.2.2 Differential gene expression

I use edgeR to perform differential gene expression analysis for each cell line individually.

## match sample info and matrix columns
sinfo <- sample_info %>% arrange(sample) %>% 
  mutate(experiment = gsub('_r\\d$', '', sample))
identical(colnames(dge), sinfo$sample)

## loop for each cell line, perform DGE.
b110_dge_res <- sinfo %>%
  filter(cellline %in% c('HCT116', 'HT29', 'DLD1')) %>%
  group_by(cellline) %>% 
  group_modify(~ { 
    ## model matrix
    mm <- model.matrix(~ 0 + knockdown, data = .x)
    
    ## expression matrix sub-sampled
    mat <- dge[,colnames(dge) %in% .x$sample]
    
    ## check for compatibility of sinfo and expr mat
    print(identical(.x$sample, colnames(mat)))
    
    ## make contrasts
    contr <- makeContrasts(CTNNB1=knockdownCTNNB1-knockdownCTRL,
                           levels=mm)
    
    ## fit model, perform test
    fit <- glmQLFit(estimateDisp(mat, mm, robust=T), mm)
    tibble(knockdown = colnames(contr)) %>% 
      mutate(dge_res = map(knockdown, ~ glmQLFTest(fit, contrast = contr[,.x]) %>%
                             topTags(n=Inf) %>% as.data.frame() %>%
                             as_tibble(rownames='symbol'))) %>%
      unnest(dge_res) %>%
      mutate(FDR_ihw = adj_pvalues(ihw(PValue, logCPM, alpha = 0.2)))
  }) %>%
  ungroup()

For the cell lines where we only have 1 replicate we have to guess the dispersion value based on the experiments where replicates are available. Relying purely on the analysis of fold changes is not desireable, since extreme fold changes are likely observed at random for genes that are expressed at low levels.

We first perform DGE analysis for cell lines where two replicates available.

est_disp <- map(c('HCT116', 'HT29', 'DLD1'), function(cl){
  ## what changes upon CTNNB1 knockdown compared to CTRL in HT29
  subinfo_ht29 <- sinfo %>% filter(cellline == cl)
  mm <- model.matrix(~ knockdown, data = subinfo_ht29)
  
  ## sub-matrix
  submat_ht29 <- dge[,grepl(cl, colnames(dge))]
  
  ## I fit the model
  fit <- estimateGLMCommonDisp(submat_ht29, mm, verbose=T)
  
  ## return bcv
  return(sqrt(fit$common.dispersion))
})

## average dispersion is the estimate I use
est_disp <- mean(unlist(est_disp))

Now we can use the average estimated dispersion in these cell lines to estimate differential gene expression in other cell lines where we have one replicate only.

b110_dge_results_1rep <- map_df(c('CACO2', 'COLO32', 'SW480'), function(cl){
  ## test with estimate dispersion
  onerep_test <- exactTest(DGEList(dge[,grepl(paste0(cl, '_C'), colnames(dge))], 
                                   group = 1:2), 
                         pair = 1:2,
                         dispersion=est_disp^2)
  
  ## return results
  onerep_test %>% topTags(n=Inf) %>% as.data.frame() %>% 
    as_tibble(rownames='symbol') %>%
    mutate(FDR_ihw = adj_pvalues(ihw(PValue, logCPM, alpha = 0.2)),
           cellline = cl,
           logFC = -1 * logFC)
  
})

## combine with previous results
b110_dge_res <- b110_dge_res %>% 
  bind_rows(b110_dge_results_1rep %>% 
              mutate(knockdown = 'CTNNB1')) %>%
  mutate(cellline = ifelse(cellline == 'COLO32', 'COLO320', cellline))

We annotate whether the genes are essential in their respective lines (if information available).

b110_dge_res <- b110_dge_res %>% left_join(
  depmap_ceres %>% 
    filter(cellline %in% c('HT29', 'DLD1', 'HCT116',
                           'COLO320', 'SW480', 'CACO2')) %>% 
    dplyr::select(cellline, symbol, cscore)
)

We can check a few selected results by plotting.

cpm_long %>% 
  filter(symbol == 'HDC', knockdown %in% c('CTRL', 'CTNNB1')) %>% 
  ggplot(aes(knockdown, cpm)) + 
  stat_summary(fun.y = 'mean', geom = 'bar') + 
  geom_point(aes(colour = rep)) + 
  facet_wrap(~ cellline)

We make volcano plots for each test. WeI also perform gene set enrichment analysis to identify regulated processes.

## volcano plots
b110_dge_res %>% 
  ggplot(aes(logFC, -log10(PValue), colour = logCPM)) + 
  ggrastr::geom_point_rast() + 
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  facet_wrap(~cellline) + 
  scale_colour_gradient(low = '#f7f7f7', high = '#4285f4')

## go biological process
src <- src_organism('TxDb.Hsapiens.UCSC.hg38.knownGene')
goterms <- tbl(src, 'id') %>% distinct(symbol, entrez) %>% 
  filter(symbol %in% local(b110_dge_res$symbol)) %>% 
  inner_join(tbl(src, 'id_go') %>% filter(ontology == 'BP')) %>% 
  collect(n=Inf) %>% mutate(go_term = Term(GOTERM)[go])
fgsea_go <- goterms %>% split(.$go_term) %>% map(~ unique(.x$symbol))

## go enrichment
enr_process <- b110_dge_res %>% 
  group_by(cellline) %>%
  mutate(s = -sign(logFC)*(-log10(PValue))) %>%
  arrange(s) %>%
  group_modify(~{
    fgsea_ranks <- setNames(.x$s, .x$symbol)
    fgsea_results <- fgsea(pathways = fgsea_go, 
                          stats = fgsea_ranks, 
                          minSize = 15, maxSize = 500, 
                          nperm = 1e4) 
    return(as_tibble(fgsea_results))
  }) %>% 
  ungroup()

8.2.2.1 Overlap between cell lines

While the analysis and experiments worked it seems at first glance that there are strong differences between the Wnt target genes for different cell lines. We quantify the overlap between individual cell lines by correlation and by Venn diagrams (after specifying a P-value cutoff).

## list of commonly mutated genes
mut_genes <- c('TP53', 'KRAS', 'BRAF', 'APC', 'CTNNB1')

## mutations of key cancer genes in our cell lines
cl_mut <- ccle_mut %>% 
  filter(symbol %in% mut_genes, 
         cellline %in% b110_dge_res$cellline) %>% 
  filter(isTCGAhotspot | isDeleterious, class != 'Splice_Site') %>%
  distinct(cellline, symbol) %>%
  mutate(has = 1) %>% 
  spread(symbol, has) %>%
  mutate_all(~ ifelse(is.na(.x), 0, .x)) %>%
  ## add dld1 manually
  bind_rows(
    tribble(
      ~cellline, ~APC, ~BRAF, ~CTNNB1, ~KRAS, ~TP53,
      'DLD1', 1, 0, 0, 1, 0,
    )
  )

## a mutation annotation for cell lines
mut_anno <- cl_mut %>% dplyr::select(cellline, APC, CTNNB1) %>% 
  mutate(MSI = as.factor(ifelse(cellline %in% c('DLD1', 'HCT116', 'RKO'), 
                      'MSI', 'MSS'))) %>%
  as.data.frame() %>% column_to_rownames('cellline')

## a LD-TF expression annotation
expr_anno <- ccle_expr_crc %>% 
  group_by(symbol) %>% mutate(tpm_scaled = scale(tpm)[,1]) %>% ungroup() %>%
  filter(cellline %in% b110_dge_res$cellline,
         symbol %in% c('CDX2', 'HNF4A', 'GATA6', 'KLF5')) %>% 
  dplyr::select(symbol, cellline, tpm_scaled) %>% 
  spread(symbol, tpm_scaled) %>% 
  as.data.frame() %>% column_to_rownames('cellline')

## colors to visulaize expr. annotation effects
expr_anno_colors <- list(
  CDX2 = colorRampPalette(c('#ffffff', '#4285f4'))(50),
  HNF4A = colorRampPalette(c('#ffffff', '#4285f4'))(50),
  GATA6 = colorRampPalette(c('#ffffff', '#4285f4'))(50),
  KLF5 = colorRampPalette(c('#ffffff', '#4285f4'))(50),
  MSI = c(MSS = '#ffffff', MSI = '#f4b400'),
  APC = c('#ffffff', '#db4437'),
  CTNNB1 = c('#ffffff', '#db4437')
)

## exclude genes that are not expressed in any of the lines
not_expr <- rownames(b110_mat_raw)[b110_mat_raw %>% apply(1, function(x) !any(x > 10))]

## top hundred targets per line
top100 <- b110_dge_res %>% filter(knockdown == 'CTNNB1') %>% 
  group_by(cellline) %>% top_n(100, -PValue) %>% ungroup() %>% 
  pull(symbol)

## correlation heatmap
colors <- c(rep('#fff5f0',4),'#fee0d2','#fcbba1',
            '#fc9272','#fb6a4a','#ef3b2c',
            '#cb181d','#a50f15', rep('#67000d', 7))

b110_dge_res %>% filter(!symbol %in% not_expr, knockdown == 'CTNNB1') %>% 
  filter(symbol %in% top100) %>%
  mutate(signif = sign(logFC) * -log10(PValue)) %>%
  acast(symbol ~ cellline, value.var = 'signif') %>% 
  cor(method = 'pearson') %>% 
  pheatmap(clustering_method = 'ward.D2',
           color = colorRampPalette(colors)(50),
           annotation_col = mut_anno,
           annotation_row = expr_anno,
           annotation_colors = expr_anno_colors)

8.3 Genome-wide targets of colorectal cancer cells in LOVO

A large number of TF binding sites were previously profiled in the LOVO colorectal cancer cell line (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE49402#4). We want to know whether HNF4A/CDX2/GATA6 occupy the same genomic locations as TCF7L2. We analyse these data accordingly. We first parse the peaks for different experiments.

data('lovo_chip_data', package = 'HDCRC2019')

8.4 Overlapping peaks between TFs

## get peak ranges for each tf in lovo
data('peak_ranges', package = 'HDCRC2019')

## convert to list
peak_ranges_list <- peak_ranges$peak_ranges
names(peak_ranges_list) <- gsub('.bed', '', basename(peak_ranges$bed_files))

## read count per peak
for(i in 1:length(peak_ranges_list)){
  peak_ranges_list[[i]]$nreads <- 
    chip_data[chip_data$name %in% names(peak_ranges_list[[i]]),'tags']$tags
}

## fraction of tcf7l2 compared to sequence covered by all peaks
frac_tcf7l2 <- chip_data %>% filter(symbol == 'TCF7L2') %>% 
  summarise(total_peak_length = sum(length)) %>%
  mutate(frac = total_peak_length/sum(width(Reduce(union, peak_ranges_list)))) %>% 
  pull(frac)
  
## calculate overlap between TCF7L2 and other TF pairs
walk(names(peak_ranges_list), ~ {
  ovlp2 <- findOverlaps(peak_ranges_list[[.x]], peak_ranges_list$TCF7L2)
  
  print(.x)
  cat(sprintf( "%d of %d (%g percent) promoters are overlapped by an enriched region.\n",
    length(unique(subjectHits(ovlp2))), length(peak_ranges_list$TCF7L2),
    round(length(unique(subjectHits(ovlp2)))/length(peak_ranges_list$TCF7L2)*100, 2)))
  
  ## binomial test
  print(binom.test(
    length(unique(subjectHits(ovlp2))), 
    length(peak_ranges_list[[.x]]), 
    frac_tcf7l2)$p.value)
})

8.5 Visualization of co-binding patterns

## first we load human gene ranges
data('gene_ranges', package = 'HDCRC2019')

## list of promoter regions (+/- 200bp around TSS)
promoter_regions <- GRanges(
  seqnames = Rle( paste0('chr', gene_ranges$chr) ),
  ranges = IRanges( start = gene_ranges$tss - 200,
                            end = gene_ranges$tss + 200 ),
          strand = Rle( rep("*", nrow(gene_ranges)) ),
          gene = gene_ranges$symbol)

## get top tcf7l2 peaks
tcf7l2_top500 <- chip_data %>% filter(symbol == 'TCF7L2') %>% 
  top_n(1000, `-10*log10(pvalue)`) %>% 
  pull(`-10*log10(pvalue)`) %>% min()
tcf7l2_top500 <- peak_ranges_list$TCF7L2[peak_ranges_list$TCF7L2$score > tcf7l2_top500]

## which promoters overlap with top TCF7L2 peaks
tcf7l2_tss <- chip_data %>% filter(symbol == 'TCF7L2') %>% 
  arrange(desc(`-10*log10(pvalue)`)) %>%
  dplyr::slice(1:500) %>%
  mutate(peak_center = round(end - ((end-start)/2), 0), 
         strand = '1',
         peak_name = paste(symbol, 1:n(), sep='_')) %>%
  filter(chr %in% as.character(1:22))

## genome information
genome <- BSgenome.Hsapiens.UCSC.hg19
si <- seqinfo(genome)
si <- si[ paste0('chr', c(1:22, 'X', 'Y'))]

## tiling
tiles <- sapply( 1:nrow(tcf7l2_tss), function(i)
   if( tcf7l2_tss$strand[i] == '1' )
      tcf7l2_tss$peak_center[i] + seq(-2500, 2400, length.out=30)
   else
      tcf7l2_tss$peak_center[i] + seq( 2400, -2500, length.out=30 ))

tiles <- GRanges(tilename = paste( rep( tcf7l2_tss$peak_name, each=30), 1:30, sep='_' ),
                seqnames = Rle( rep(paste0('chr', tcf7l2_tss$chr), each=30) ), 
                ranges = IRanges(start = as.vector(tiles),
                                 width = 30),
                strand = Rle(rep("*", length(as.vector(tiles)))),
                seqinfo=si)

## make heat maps
walk(c('TCF7L2', 'KLF5', 'HNF4A', 'GATA6', 'CDX2', 'MYBL2', 'SREBF1', 'LEF1'), ~ {
  ## number of reads per overlapping peak
  ol <- findOverlaps( tiles, peak_ranges_list[[.x]])
  num_vec <- rep(0, length(tiles))
  num_vec[queryHits(ol)] <- peak_ranges_list[[.x]][subjectHits(ol)]$nreads
  tcf7l2_p_mat <- matrix( num_vec, nrow=nrow(tcf7l2_tss), 
                             ncol=30, byrow=TRUE )
  
  ## plot results as heatmap
  colors <- colorRampPalette(c('#ffffff', '#9ecae1','#4292c6',
                               rep('#08306b', 15)))(100) 
  image(x=seq(-2500, 2500, length.out=30),
        y=1:nrow(tcf7l2_p_mat),
        z = t(tcf7l2_p_mat),
        col=colors,
        xlab='Distance from TCF7L2 (bp)',
        ylab='Promoters', lwd=2,
        main = .x)
  box(col='black', lwd=2)
  abline(v=0, lwd=1, col='gray')
})

9 Session info

sessionInfo()